# 데이터 분석
import pandas as pd
import numpy as np
import random as rnd
import csv
from collections import Counter
# 시각화
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
# 기계 학습
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
#데이터 불러오기
transac_data = pd.read_csv('transactions.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
songs_data = pd.read_csv('songs.csv')
transac_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1819855 entries, 0 to 1819854 Data columns (total 6 columns): # Column Dtype --- ------ ----- 0 user_id int64 1 song_id int64 2 rec_loc object 3 rec_screen object 4 entry object 5 listen int64 dtypes: int64(3), object(3) memory usage: 83.3+ MB
transac_data.head()
| user_id | song_id | rec_loc | rec_screen | entry | listen | |
|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 |
| 1 | 7 | 7 | search | Search | song | 1 |
| 2 | 14 | 14 | discover | Online playlist more | song-based-playlist | 0 |
| 3 | 22 | 22 | my library | Local playlist more | local-library | 0 |
| 4 | 24 | 24 | search | Online playlist more | online-playlist | 0 |
transac_data['user_id'] = transac_data['user_id'].astype(object)
transac_data['song_id'] = transac_data['song_id'].astype(object)
transac_data['listen'] = transac_data['listen'].astype(object)
rec_data = transac_data['song_id'].value_counts()
rec_data
98 7254
38 7085
266 6865
150 6741
412 6409
...
8781 48
9049 47
7880 45
7421 45
8535 40
Name: song_id, Length: 4878, dtype: int64
#가장 추천이 많이 된 곡
songs_data[songs_data['song_id'] == 98]
| song_id | length | genre | artist | composer | lyricist | language | isrc | |
|---|---|---|---|---|---|---|---|---|
| 155 | 98 | 219324 | 458 | 500 | 1387 | 1336 | 3 | TWAG11600306 |
#가장 추천이 적게 된 곡
songs_data[songs_data['song_id'] == 8535]
| song_id | length | genre | artist | composer | lyricist | language | isrc | |
|---|---|---|---|---|---|---|---|---|
| 3270 | 8535 | 306050 | 465 | 431 | 1495 | 1452 | 17 | JPVI01527970 |
# 추천 성공 / 실패 트랜잭션
success = transac_data[transac_data['listen'] == 1]
fail= transac_data[transac_data['listen'] == 0]
rec_success = success['song_id'].value_counts()
rec_success
38 5981
98 5976
266 5490
2 5121
150 5100
...
9189 11
7867 11
1878 11
4975 8
9387 7
Name: song_id, Length: 4878, dtype: int64
#추천 성공률
rec_rate = pd.merge(rec_data, rec_success, left_index=True, right_index=True, how='left')
rec_rate['rec_rate'] = round(rec_rate['song_id_y'] / rec_rate['song_id_x'] *100,2)
rec_rate.sort_values(by=['rec_rate'], axis=0,inplace=True, ascending=False)
rec_rate.columns = ['all', 'suc', 'rec_rate']
rec_rate
| all | suc | rec_rate | |
|---|---|---|---|
| 431 | 1564 | 1374 | 87.85 |
| 38 | 7085 | 5981 | 84.42 |
| 98 | 7254 | 5976 | 82.38 |
| 716 | 3485 | 2806 | 80.52 |
| 2 | 6400 | 5121 | 80.02 |
| ... | ... | ... | ... |
| 7867 | 71 | 11 | 15.49 |
| 3205 | 92 | 14 | 15.22 |
| 6549 | 86 | 13 | 15.12 |
| 9387 | 55 | 7 | 12.73 |
| 4975 | 76 | 8 | 10.53 |
4878 rows × 3 columns
#추천 성공률이 높은 곡
songs_data[songs_data['song_id'] == 431]
| song_id | length | genre | artist | composer | lyricist | language | isrc | |
|---|---|---|---|---|---|---|---|---|
| 2272 | 431 | 234161 | 458 | 834 | 2201 | 1689 | 3 | TWAE31700062 |
songs_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4878 entries, 0 to 4877 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 song_id 4878 non-null int64 1 length 4878 non-null int64 2 genre 4878 non-null int64 3 artist 4878 non-null int64 4 composer 4878 non-null int64 5 lyricist 4878 non-null int64 6 language 4878 non-null int64 7 isrc 4878 non-null object dtypes: int64(7), object(1) memory usage: 305.0+ KB
songs_data.head()
| song_id | length | genre | artist | composer | lyricist | language | isrc | |
|---|---|---|---|---|---|---|---|---|
| 0 | 5981 | 273554 | 465 | 330 | 1787 | 1533 | 3 | TWC950206108 |
| 1 | 5767 | 228623 | 465 | 633 | 484 | 1057 | 3 | TWA531398021 |
| 2 | 8128 | 257602 | 458 | 777 | 1916 | 2115 | 3 | TWAY31508002 |
| 3 | 1992 | 199157 | 458 | 548 | 1011 | 1904 | 3 | TWA471596001 |
| 4 | 2665 | 246015 | 465 | 671 | 1407 | 1362 | 3 | TWK231306302 |
#1) object로 형변환(genre / artist / composer / lyricist / language)
songs_data['song_id'] = songs_data['song_id'].astype(object)
songs_data['genre'] = songs_data['genre'].astype(object)
songs_data['artist'] = songs_data['artist'].astype(object)
songs_data['composer'] = songs_data['composer'].astype(object)
songs_data['lyricist'] = songs_data['lyricist'].astype(object)
songs_data['language'] = songs_data['language'].astype(object)
#형변환 완료 확인
songs_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4878 entries, 0 to 4877 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 song_id 4878 non-null object 1 length 4878 non-null int64 2 genre 4878 non-null object 3 artist 4878 non-null object 4 composer 4878 non-null object 5 lyricist 4878 non-null object 6 language 4878 non-null object 7 isrc 4878 non-null object dtypes: int64(1), object(7) memory usage: 305.0+ KB
train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7000 entries, 0 to 6999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 7000 non-null int64 1 city 7000 non-null int64 2 age 7000 non-null int64 3 reg_method 7000 non-null int64 4 reg_date 7000 non-null int64 5 ex_date 7000 non-null int64 6 gender 7000 non-null object dtypes: int64(6), object(1) memory usage: 382.9+ KB
train_data.head()
| user_id | city | age | reg_method | reg_date | ex_date | gender | |
|---|---|---|---|---|---|---|---|
| 0 | 13827 | 14 | 29 | 9 | 20130112 | 20180203 | female |
| 1 | 6618 | 13 | 19 | 4 | 20160313 | 20180106 | female |
| 2 | 7615 | 22 | 33 | 9 | 20060314 | 20170930 | female |
| 3 | 15535 | 13 | 48 | 9 | 20110402 | 20171003 | female |
| 4 | 17708 | 18 | 20 | 4 | 20160102 | 20161105 | female |
#1) object로 형변환(city / reg_method)
train_data['city'] = train_data['city'].astype(object)
train_data['reg_method'] = train_data['reg_method'].astype(object)
#2) datetime으로 형변환(reg_date / ex_date)
train_data['reg_date'] = train_data['reg_date'].astype(str)
train_data['reg_date'] = train_data['reg_date'].str[0:4] + "-" +train_data['reg_date'].str[4:6] + "-" + train_data['reg_date'].str[6:8]
train_data['reg_date'] = train_data['reg_date'].astype('datetime64[ns]')
train_data['ex_date'] = train_data['ex_date'].astype(str)
train_data['ex_date'] = train_data['ex_date'].str[0:4] + "-" +train_data['ex_date'].str[4:6] + "-" + train_data['ex_date'].str[6:8]
train_data['ex_date'] = train_data['ex_date'].astype('datetime64[ns]')
#형변환 완료 확인
train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7000 entries, 0 to 6999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 7000 non-null int64 1 city 7000 non-null object 2 age 7000 non-null int64 3 reg_method 7000 non-null object 4 reg_date 7000 non-null datetime64[ns] 5 ex_date 7000 non-null datetime64[ns] 6 gender 7000 non-null object dtypes: datetime64[ns](2), int64(2), object(3) memory usage: 382.9+ KB
test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1679 entries, 0 to 1678 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1679 non-null int64 1 city 1679 non-null int64 2 age 1679 non-null int64 3 reg_method 1679 non-null int64 4 reg_date 1679 non-null int64 5 ex_date 1679 non-null int64 6 gender 0 non-null float64 dtypes: float64(1), int64(6) memory usage: 91.9 KB
test_data.head()
| user_id | city | age | reg_method | reg_date | ex_date | gender | |
|---|---|---|---|---|---|---|---|
| 0 | 16169 | 13 | 21 | 3 | 20120118 | 20180604 | NaN |
| 1 | 4694 | 13 | 0 | 3 | 20150110 | 20171014 | NaN |
| 2 | 10654 | 1 | 18 | 3 | 20150402 | 20180312 | NaN |
| 3 | 9327 | 14 | 16 | 4 | 20160824 | 20171002 | NaN |
| 4 | 3631 | 22 | 26 | 9 | 20100927 | 20171225 | NaN |
#1) object로 형변환(city / reg_method)
test_data['city'] = test_data['city'].astype(object)
test_data['reg_method'] = test_data['reg_method'].astype(object)
#2) datetime으로 형변환(reg_date / ex_date)
test_data['reg_date'] = test_data['reg_date'].astype(str)
test_data['reg_date'] = test_data['reg_date'].str[0:4] + "-" +test_data['reg_date'].str[4:6] + "-" + test_data['reg_date'].str[6:8]
test_data['reg_date'] = test_data['reg_date'].astype('datetime64[ns]')
test_data['ex_date'] = test_data['ex_date'].astype(str)
test_data['ex_date'] = test_data['ex_date'].str[0:4] + "-" +test_data['ex_date'].str[4:6] + "-" + test_data['ex_date'].str[6:8]
test_data['ex_date'] = test_data['ex_date'].astype('datetime64[ns]')
#형변환 완료 확인
test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1679 entries, 0 to 1678 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1679 non-null int64 1 city 1679 non-null object 2 age 1679 non-null int64 3 reg_method 1679 non-null object 4 reg_date 1679 non-null datetime64[ns] 5 ex_date 1679 non-null datetime64[ns] 6 gender 0 non-null float64 dtypes: datetime64[ns](2), float64(1), int64(2), object(2) memory usage: 91.9+ KB
train_data.describe()
| user_id | age | |
|---|---|---|
| count | 7000.000000 | 7000.000000 |
| mean | 11160.084571 | 28.128857 |
| std | 6904.768964 | 14.844875 |
| min | 1.000000 | 0.000000 |
| 25% | 5224.250000 | 22.000000 |
| 50% | 10691.500000 | 27.000000 |
| 75% | 16693.000000 | 33.000000 |
| max | 24977.000000 | 931.000000 |
train_data['count']=1
train_data['gender'].value_counts()
male 3600 female 3400 Name: gender, dtype: int64
# train data 남녀 분포
fig = px.pie(train_data, values='count', names='gender')
fig.show()
# train data 나이 분포
sns.countplot(x='age',data=train_data, palette='Set2')
plt.show()
#이상값 age가 0살이거나 110세 이상인 경우
age_outlier = train_data[(train_data['age'] < 1) | (train_data['age'] > 110) ]
age_outlier
| user_id | city | age | reg_method | reg_date | ex_date | gender | count | |
|---|---|---|---|---|---|---|---|---|
| 24 | 22786 | 1 | 0 | 3 | 2014-01-16 | 2017-06-11 | female | 1 |
| 122 | 16328 | 18 | 0 | 3 | 2015-02-15 | 2017-09-17 | female | 1 |
| 149 | 18779 | 5 | 0 | 7 | 2013-10-07 | 2017-11-25 | female | 1 |
| 151 | 1115 | 4 | 0 | 4 | 2016-06-15 | 2017-09-26 | female | 1 |
| 153 | 21698 | 4 | 0 | 9 | 2004-03-28 | 2017-09-26 | female | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6842 | 17981 | 5 | 0 | 9 | 2015-03-22 | 2017-08-09 | female | 1 |
| 6915 | 21227 | 1 | 0 | 3 | 2015-02-05 | 2016-11-29 | female | 1 |
| 6969 | 16355 | 13 | 0 | 9 | 2005-10-09 | 2017-09-30 | male | 1 |
| 6978 | 5877 | 22 | 0 | 9 | 2015-01-28 | 2017-09-14 | male | 1 |
| 6989 | 22281 | 1 | 0 | 7 | 2010-10-05 | 2017-04-20 | female | 1 |
215 rows × 8 columns
#이상값 age>110 확인
train_data[train_data['age'] > 110]
| user_id | city | age | reg_method | reg_date | ex_date | gender | count | |
|---|---|---|---|---|---|---|---|---|
| 1802 | 13805 | 15 | 131 | 7 | 2010-12-16 | 2017-09-16 | female | 1 |
| 6767 | 1018 | 22 | 931 | 9 | 2011-10-17 | 2017-10-20 | male | 1 |
#age 이상값 성별분포 확인
age_outlier.groupby('gender').count()
| user_id | city | age | reg_method | reg_date | ex_date | count | |
|---|---|---|---|---|---|---|---|
| gender | |||||||
| female | 121 | 121 | 121 | 121 | 121 | 121 | 121 |
| male | 94 | 94 | 94 | 94 | 94 | 94 | 94 |
def func_age(row):
if row['age'] < 1 or row['age'] > 110:
return 27
else:
return row['age']
train_data['age'] = train_data.apply(func_age, axis=1)
#이상값 대체 확인
train_data[(train_data['age'] < 1) | (train_data['age'] > 110) ]
| user_id | city | age | reg_method | reg_date | ex_date | gender | count |
|---|
# train data 나이 분포 다시 확인
sns.countplot(x='age',data=train_data, palette='Set2')
plt.show()
transac_data.head()
| user_id | song_id | rec_loc | rec_screen | entry | listen | |
|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 |
| 1 | 7 | 7 | search | Search | song | 1 |
| 2 | 14 | 14 | discover | Online playlist more | song-based-playlist | 0 |
| 3 | 22 | 22 | my library | Local playlist more | local-library | 0 |
| 4 | 24 | 24 | search | Online playlist more | online-playlist | 0 |
transac_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1819855 entries, 0 to 1819854 Data columns (total 6 columns): # Column Dtype --- ------ ----- 0 user_id object 1 song_id object 2 rec_loc object 3 rec_screen object 4 entry object 5 listen object dtypes: object(6) memory usage: 83.3+ MB
transac_data['count']=1
transac_data
| user_id | song_id | rec_loc | rec_screen | entry | listen | count | |
|---|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 | 1 |
| 1 | 7 | 7 | search | Search | song | 1 | 1 |
| 2 | 14 | 14 | discover | Online playlist more | song-based-playlist | 0 | 1 |
| 3 | 22 | 22 | my library | Local playlist more | local-library | 0 | 1 |
| 4 | 24 | 24 | search | Online playlist more | online-playlist | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1819850 | 6273 | 6415 | my library | Local playlist more | local-library | 0 | 1 |
| 1819851 | 4505 | 8831 | my library | Local playlist more | local-library | 0 | 1 |
| 1819852 | 682 | 1921 | radio | Radio | radio | 0 | 1 |
| 1819853 | 12056 | 7586 | search | Artist more | top-hits-for-artist | 0 | 1 |
| 1819854 | 9505 | 889 | my library | Local playlist more | local-library | 1 | 1 |
1819855 rows × 7 columns
# transac_data rec_loc 분포
transac_data['rec_loc'].value_counts()
my library 929302 discover 559231 search 131825 listen with 86494 radio 73250 explore 38104 notification 1440 settings 209 Name: rec_loc, dtype: int64
# transac_data rec_screen 분포
transac_data['rec_screen'].value_counts()
Local playlist more 873111 Online playlist more 378137 Others profile more 85748 Discover Chart 75455 Radio 74030 Discover Feature 65963 Album more 63203 Artist more 62534 Search 55473 My library 20032 Explore 19634 Unknown 17920 Discover Genre 11808 Discover New 5474 Search Home 5036 Search Trends 4283 My library_Search 1965 Self profile more 39 Concert 6 Payment 4 Name: rec_screen, dtype: int64
# transac_data entry 분포
transac_data['entry'].value_counts()
local-library 585531 online-playlist 550462 local-playlist 302692 listen-with 80370 radio 74491 album 68861 top-hits-for-artist 63781 song 47309 song-based-playlist 42481 topic-article-playlist 3411 artist 399 my-daily-playlist 67 Name: entry, dtype: int64
#m_transac_train = pd.merge(transac_data, train_data, on="user_id", how='left')
m_transac_train = pd.merge(transac_data, train_data, on="user_id", how='inner')
#중복된 count 속성 삭제 후 재추가
m_transac_train.drop('count_x', axis=1, inplace=True)
m_transac_train.drop('count_y', axis=1, inplace=True)
m_transac_train['count']=1
m_transac_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1019674 entries, 0 to 1019673 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1019674 non-null object 1 song_id 1019674 non-null object 2 rec_loc 1019674 non-null object 3 rec_screen 1019674 non-null object 4 entry 1019674 non-null object 5 listen 1019674 non-null object 6 city 1019674 non-null object 7 age 1019674 non-null int64 8 reg_method 1019674 non-null object 9 reg_date 1019674 non-null datetime64[ns] 10 ex_date 1019674 non-null datetime64[ns] 11 gender 1019674 non-null object 12 count 1019674 non-null int64 dtypes: datetime64[ns](2), int64(2), object(9) memory usage: 108.9+ MB
m_transac_train
| user_id | song_id | rec_loc | rec_screen | entry | listen | city | age | reg_method | reg_date | ex_date | gender | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 1 |
| 1 | 5 | 4401 | radio | Radio | radio | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 1 |
| 2 | 5 | 359 | search | Artist more | top-hits-for-artist | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 1 |
| 3 | 5 | 1320 | search | Search | song | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 1 |
| 4 | 5 | 2210 | discover | Online playlist more | online-playlist | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1019669 | 23874 | 175 | discover | Discover Chart | online-playlist | 0 | 13 | 37 | 9 | 2008-04-16 | 2017-09-27 | female | 1 |
| 1019670 | 23874 | 2369 | discover | Discover Chart | online-playlist | 0 | 13 | 37 | 9 | 2008-04-16 | 2017-09-27 | female | 1 |
| 1019671 | 18363 | 637 | radio | Radio | radio | 0 | 5 | 30 | 9 | 2004-11-30 | 2016-11-03 | male | 1 |
| 1019672 | 18363 | 7320 | radio | Radio | radio | 0 | 5 | 30 | 9 | 2004-11-30 | 2016-11-03 | male | 1 |
| 1019673 | 13559 | 1869 | my library | Album more | album | 1 | 8 | 23 | 9 | 2013-11-23 | 2017-06-09 | female | 1 |
1019674 rows × 13 columns
m_transac_train['gender'].value_counts()
male 526089 female 493585 Name: gender, dtype: int64
songs_data.head()
| song_id | length | genre | artist | composer | lyricist | language | isrc | |
|---|---|---|---|---|---|---|---|---|
| 0 | 5981 | 273554 | 465 | 330 | 1787 | 1533 | 3 | TWC950206108 |
| 1 | 5767 | 228623 | 465 | 633 | 484 | 1057 | 3 | TWA531398021 |
| 2 | 8128 | 257602 | 458 | 777 | 1916 | 2115 | 3 | TWAY31508002 |
| 3 | 1992 | 199157 | 458 | 548 | 1011 | 1904 | 3 | TWA471596001 |
| 4 | 2665 | 246015 | 465 | 671 | 1407 | 1362 | 3 | TWK231306302 |
mm_transac_train = pd.merge(m_transac_train, songs_data, on="song_id")
#중복된 count 속성 삭제 후 재추가
mm_transac_train.drop('count', axis=1, inplace=True)
mm_transac_train['count']=1
mm_transac_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1019674 entries, 0 to 1019673 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1019674 non-null object 1 song_id 1019674 non-null object 2 rec_loc 1019674 non-null object 3 rec_screen 1019674 non-null object 4 entry 1019674 non-null object 5 listen 1019674 non-null object 6 city 1019674 non-null object 7 age 1019674 non-null int64 8 reg_method 1019674 non-null object 9 reg_date 1019674 non-null datetime64[ns] 10 ex_date 1019674 non-null datetime64[ns] 11 gender 1019674 non-null object 12 length 1019674 non-null int64 13 genre 1019674 non-null object 14 artist 1019674 non-null object 15 composer 1019674 non-null object 16 lyricist 1019674 non-null object 17 language 1019674 non-null object 18 isrc 1019674 non-null object 19 count 1019674 non-null int64 dtypes: datetime64[ns](2), int64(3), object(15) memory usage: 163.4+ MB
# 형변환
mm_transac_train['user_id'] = mm_transac_train['user_id'].astype(object)
mm_transac_train['song_id'] = mm_transac_train['song_id'].astype(object)
mm_transac_train['listen'] = mm_transac_train['listen'].astype(object)
df = mm_transac_train
df.describe()
| age | length | count | |
|---|---|---|---|
| count | 1.019674e+06 | 1.019674e+06 | 1019674.0 |
| mean | 2.802562e+01 | 2.504527e+05 | 1.0 |
| std | 8.149761e+00 | 3.967728e+04 | 0.0 |
| min | 3.000000e+00 | 4.354600e+04 | 1.0 |
| 25% | 2.300000e+01 | 2.236550e+05 | 1.0 |
| 50% | 2.700000e+01 | 2.498350e+05 | 1.0 |
| 75% | 3.100000e+01 | 2.756440e+05 | 1.0 |
| max | 1.050000e+02 | 7.584910e+05 | 1.0 |
#성별에 따른 장르 분포
fig = px.sunburst(df, path=['gender', 'genre'], values='count')
fig.show()
df
| user_id | song_id | rec_loc | rec_screen | entry | listen | city | age | reg_method | reg_date | ex_date | gender | length | genre | artist | composer | lyricist | language | isrc | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 | 5 | 20 | 9 | 2016-12-19 | 2017-09-11 | male | 219150 | 465 | 55 | 960 | 520 | 17 | JPB601402020 | 1 |
| 1 | 314 | 5 | discover | Online playlist more | online-playlist | 1 | 8 | 30 | 3 | 2015-12-27 | 2017-09-27 | male | 219150 | 465 | 55 | 960 | 520 | 17 | JPB601402020 | 1 |
| 2 | 534 | 5 | my library | Local playlist more | local-library | 1 | 4 | 27 | 9 | 2006-09-22 | 2017-10-02 | male | 219150 | 465 | 55 | 960 | 520 | 17 | JPB601402020 | 1 |
| 3 | 768 | 5 | discover | Discover Chart | online-playlist | 1 | 5 | 26 | 3 | 2013-11-14 | 2017-09-29 | female | 219150 | 465 | 55 | 960 | 520 | 17 | JPB601402020 | 1 |
| 4 | 1097 | 5 | discover | Discover Chart | online-playlist | 0 | 8 | 19 | 3 | 2014-10-18 | 2017-05-03 | male | 219150 | 465 | 55 | 960 | 520 | 17 | JPB601402020 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1019669 | 16766 | 4860 | my library | Album more | album | 0 | 18 | 26 | 9 | 2010-01-26 | 2017-10-12 | female | 296437 | 2157 | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 |
| 1019670 | 17143 | 4860 | discover | Album more | album | 1 | 6 | 54 | 9 | 2007-01-31 | 2017-03-04 | male | 296437 | 2157 | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 |
| 1019671 | 16096 | 4860 | my library | Local playlist more | album | 0 | 13 | 29 | 7 | 2011-08-02 | 2017-03-29 | male | 296437 | 2157 | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 |
| 1019672 | 21698 | 4860 | explore | Online playlist more | online-playlist | 0 | 4 | 27 | 9 | 2004-03-28 | 2017-09-26 | female | 296437 | 2157 | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 |
| 1019673 | 20125 | 4860 | my library | Local playlist more | local-library | 0 | 13 | 59 | 9 | 2017-01-28 | 2017-09-30 | male | 296437 | 2157 | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 |
1019674 rows × 20 columns
df['isrc'] = df['isrc'].astype(str)
df['country_code'] = df['isrc'].str[0:2]
df['first_regist_code'] = df['isrc'].str[2:5]
df['year_of_referance'] = df['isrc'].str[5:7]
df['designation_code'] = df['isrc'].str[7:]
df
| user_id | song_id | rec_loc | rec_screen | entry | listen | city | age | reg_method | reg_date | ... | artist | composer | lyricist | language | isrc | count | country_code | first_regist_code | year_of_referance | designation_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 5 | discover | Online playlist more | online-playlist | 0 | 5 | 20 | 9 | 2016-12-19 | ... | 55 | 960 | 520 | 17 | JPB601402020 | 1 | JP | B60 | 14 | 02020 |
| 1 | 314 | 5 | discover | Online playlist more | online-playlist | 1 | 8 | 30 | 3 | 2015-12-27 | ... | 55 | 960 | 520 | 17 | JPB601402020 | 1 | JP | B60 | 14 | 02020 |
| 2 | 534 | 5 | my library | Local playlist more | local-library | 1 | 4 | 27 | 9 | 2006-09-22 | ... | 55 | 960 | 520 | 17 | JPB601402020 | 1 | JP | B60 | 14 | 02020 |
| 3 | 768 | 5 | discover | Discover Chart | online-playlist | 1 | 5 | 26 | 3 | 2013-11-14 | ... | 55 | 960 | 520 | 17 | JPB601402020 | 1 | JP | B60 | 14 | 02020 |
| 4 | 1097 | 5 | discover | Discover Chart | online-playlist | 0 | 8 | 19 | 3 | 2014-10-18 | ... | 55 | 960 | 520 | 17 | JPB601402020 | 1 | JP | B60 | 14 | 02020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1019669 | 16766 | 4860 | my library | Album more | album | 0 | 18 | 26 | 9 | 2010-01-26 | ... | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 | TW | I43 | 16 | 03234 |
| 1019670 | 17143 | 4860 | discover | Album more | album | 1 | 6 | 54 | 9 | 2007-01-31 | ... | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 | TW | I43 | 16 | 03234 |
| 1019671 | 16096 | 4860 | my library | Local playlist more | album | 0 | 13 | 29 | 7 | 2011-08-02 | ... | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 | TW | I43 | 16 | 03234 |
| 1019672 | 21698 | 4860 | explore | Online playlist more | online-playlist | 0 | 4 | 27 | 9 | 2004-03-28 | ... | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 | TW | I43 | 16 | 03234 |
| 1019673 | 20125 | 4860 | my library | Local playlist more | local-library | 0 | 13 | 59 | 9 | 2017-01-28 | ... | 676 | 1766 | 1789 | 10 | TWI431603234 | 1 | TW | I43 | 16 | 03234 |
1019674 rows × 24 columns
df['country_code'].value_counts()
TW 791964 HK 53213 US 46064 KR 29764 CN 25503 GB 22605 JP 11133 NO 9231 MY 4557 NL 3942 SG 3941 SE 3315 DE 3046 GT 2553 TG 1797 CA 1508 DK 1504 QM 1276 EM 927 MO 462 AU 359 FR 280 UK 233 NG 191 TC 131 CY 101 GL 40 TH 34 Name: country_code, dtype: int64
df['gender'].unique()
array(['male', 'female'], dtype=object)
agg_format={'age':'mean', 'length':'mean'}
df.groupby('gender').agg(agg_format)
| age | length | |
|---|---|---|
| gender | ||
| female | 28.190642 | 250020.416058 |
| male | 27.870786 | 250858.341855 |
agg_format={'age':'median', 'length':'median'}
df.groupby('gender').agg(agg_format)
| age | length | |
|---|---|---|
| gender | ||
| female | 27 | 249010 |
| male | 27 | 250253 |
#음원 발매 나라별 청취 도시 비율
fig = px.sunburst(df, path=['country_code', 'city'], values='count')
fig.show()
#청취 device 통계 생성
device = transac_data.groupby(['rec_loc', 'rec_screen','entry','listen'])['user_id'].count()
device.to_csv('device.csv')
발매 연도별 청취연령의 평균
def func_year(row):
if int(row['year_of_referance']) > 20:
return int(row['year_of_referance']) + 1900
else:
return int(row['year_of_referance']) + 2000
df['year_of_referance'] = df.apply(func_year, axis=1)
len(df['year_of_referance'].unique())
37
year_dist = df.groupby('year_of_referance').mean()
year_dist = pd.DataFrame(year_dist)
year_dist
# fig = px.line(year_dist, x="year_of_referance", y="n_age")
# fig.show()
| age | length | count | |
|---|---|---|---|
| year_of_referance | |||
| 1927 | 31.125000 | 289459.000000 | 1.0 |
| 1968 | 26.757009 | 275620.000000 | 1.0 |
| 1980 | 30.342857 | 310657.000000 | 1.0 |
| 1984 | 29.472222 | 302690.000000 | 1.0 |
| 1985 | 29.773333 | 263288.000000 | 1.0 |
| 1986 | 33.288235 | 198394.297059 | 1.0 |
| 1987 | 29.663793 | 255361.663793 | 1.0 |
| 1988 | 32.264798 | 263461.386293 | 1.0 |
| 1989 | 29.744565 | 262191.896739 | 1.0 |
| 1990 | 30.476596 | 282888.642553 | 1.0 |
| 1991 | 31.338798 | 299833.000000 | 1.0 |
| 1992 | 31.417112 | 269021.308378 | 1.0 |
| 1993 | 29.545263 | 306862.619649 | 1.0 |
| 1994 | 31.052738 | 286593.517241 | 1.0 |
| 1995 | 29.320628 | 282966.436099 | 1.0 |
| 1996 | 30.209119 | 282322.777778 | 1.0 |
| 1997 | 29.218176 | 278398.954411 | 1.0 |
| 1998 | 28.887542 | 269401.861170 | 1.0 |
| 1999 | 29.476810 | 257575.502085 | 1.0 |
| 2000 | 28.858168 | 262539.044146 | 1.0 |
| 2001 | 28.483621 | 265199.327038 | 1.0 |
| 2002 | 28.398612 | 260509.631904 | 1.0 |
| 2003 | 28.174489 | 255997.990587 | 1.0 |
| 2004 | 27.905534 | 265719.875234 | 1.0 |
| 2005 | 27.912200 | 255827.667221 | 1.0 |
| 2006 | 28.291326 | 257966.381006 | 1.0 |
| 2007 | 28.048811 | 255423.764290 | 1.0 |
| 2008 | 27.872759 | 261687.486866 | 1.0 |
| 2009 | 27.983808 | 257713.188898 | 1.0 |
| 2010 | 27.808410 | 257344.898107 | 1.0 |
| 2011 | 27.929324 | 260995.320671 | 1.0 |
| 2012 | 27.559251 | 249155.393687 | 1.0 |
| 2013 | 27.896090 | 258757.082813 | 1.0 |
| 2014 | 27.542313 | 251924.175455 | 1.0 |
| 2015 | 27.731263 | 246082.634961 | 1.0 |
| 2016 | 28.167318 | 242958.479015 | 1.0 |
| 2017 | 28.902467 | 241602.628416 | 1.0 |
fig = px.box(df, y="age")
fig.show()
year_dist = year_dist.reset_index()
#발매 연도별 청취연령 boxplot
fig = px.box(df, x="year_of_referance", y="age", color="year_of_referance")
fig.show()